import pandas as pd
import json
import numpy as np
from itertools import repeat
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
from concurrent.futures import wait as futures_wait
from concurrent.futures.process import ProcessPoolExecutor
import importlib
import hashtag_util as ut
import sys
sys.path.insert(0, '../')
import general_utils as gen_ut
# Load only the three columns used by the hashtag analysis.
df = pd.read_csv(
    '../tweets_novax.csv',
    low_memory=False,
    usecols=['user_screen_name', 'hashtags', 'created_at'],
)
# Parse the tweet timestamps into timezone-aware datetimes.
# The time of day is spelled out as %H:%M:%S instead of %X: %X means "the
# locale's time representation", so the parse would depend on the LC_TIME
# setting of the machine running the notebook.
# NOTE(review): %a and %b (day/month names) are still locale-dependent; this
# assumes the names in the CSV are English and the locale is C/English.
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %H:%M:%S %z %Y")
# Build a table of every hashtag together with the number of times it is used.
listHashtags = []
for s in df['hashtags']:
    # gen_ut.get_string_json is defined in general_utils; presumably it yields
    # the 'text' field of each hashtag entry in the JSON string -- confirm.
    # extend() adds every returned item without building a throwaway list of
    # None values, which the previous side-effect list comprehension did.
    listHashtags.extend(gen_ut.get_string_json(s, 'text'))

# One row per hashtag occurrence; the constant 'count' column only exists so
# that groupby(...).count() has a column to count per hashtag.
dfHashtags = pd.DataFrame()
dfHashtags['hashtags'] = listHashtags
dfHashtags['count'] = 0
dfHashtags = dfHashtags.groupby('hashtags').count()
# Most used hashtags first.
dfHashtags.sort_values(['count'], axis=0, inplace=True, ascending=False)
dfHashtags
| count | |
|---|---|
| hashtags | |
| vaccino | 40214 |
| vaccini | 25023 |
| COVID19 | 18250 |
| AstraZeneca | 12912 |
| Pfizer | 11718 |
| ... | ... |
| appoggiati | 1 |
| approfondimento | 1 |
| aprireebasta | 1 |
| CuiProdest | 1 |
| 01Marzo | 1 |
19258 rows × 1 columns
# Reload hashtag_util so that edits made to the module since it was imported
# are picked up by the calls below.
importlib.reload(ut)
# process_dfUse is defined in hashtag_util. Judging by the rendered result
# (columns Week/Year, hashtag, count) it presumably returns the number of uses
# of each hashtag per week -- confirm against hashtag_util.
dfUse = ut.process_dfUse(df)
# Bare expression: displays the frame when this is the last line of a cell.
dfUse
| Week/Year | hashtag | count | |
|---|---|---|---|
| 35629 | 2020-01-06 | AIFA | 163 |
| 35633 | 2020-01-06 | Brusaferro | 1 |
| 35634 | 2020-01-06 | Burioni | 13 |
| 35635 | 2020-01-06 | CercasiCavieUmane | 2 |
| 35636 | 2020-01-06 | Di | 1 |
| ... | ... | ... | ... |
| 4038 | 2021-05-17 | Texas | 3 |
| 4037 | 2021-05-17 | Terroristi | 2 |
| 4036 | 2021-05-17 | TerapiaGenicaSperimentale | 1 |
| 4100 | 2021-05-17 | agoràrai | 1 |
| 3959 | 2021-05-17 | Regime | 2 |
51849 rows × 3 columns
# Reload hashtag_util so that edits made to the module are picked up.
importlib.reload(ut)
# Plot the unfiltered hashtag counts with the helpers from hashtag_util.
# NOTE(review): the meaning of the numeric arguments (200, 50) is defined in
# hashtag_util.visual_histogram and is not visible here -- confirm.
ut.visual_histogram(dfHashtags,200,50)
# Both helpers receive the per-hashtag totals and the dfUse frame; presumably
# they plot usage over time in one combined figure and in separate figures.
ut.visual_by_date_together(dfHashtags,dfUse)
ut.visual_by_date_split(dfHashtags,dfUse)
# Regular expressions for the generic vaccine/COVID hashtags to leave out.
# Each one is matched against the lower-cased hashtag with str.match, i.e. it
# is anchored at the start of the hashtag only.
hastagRemove = ['vaccin.*','covid.*','corona.*','astrazeneca','pfizer','sarscov2','sputnikv','moderna']

# Mark every hashtag that matches at least one of the patterns.
lowerTags = dfHashtags.index.str.lower()
toRemove = np.zeros(len(lowerTags), dtype=bool)
for r in hastagRemove:
    toRemove |= np.asarray(lowerTags.str.match(r, na=False), dtype=bool)

# Build the filtered table as an independent copy. Assigning dfHashtags
# directly and dropping rows in place would also remove the rows from
# dfHashtags itself, which is read again later as the overall total.
dfHashtagFiltered = dfHashtags.loc[~toRemove].copy()
dfHashtagFiltered
| count | |
|---|---|
| hashtags | |
| Conte | 5087 |
| Speranza | 4572 |
| lockdown | 4492 |
| Arcuri | 3900 |
| NessunaCorrelazione | 3458 |
| ... | ... |
| appoggiati | 1 |
| approfondimento | 1 |
| aprireebasta | 1 |
| CuiProdest | 1 |
| 01Marzo | 1 |
18458 rows × 1 columns
# Same three plots as for the full table, now on the table without the generic
# vaccine/COVID hashtags.
# NOTE(review): the meaning of the numeric arguments (100, 50) is defined in
# hashtag_util.visual_histogram and is not visible here -- confirm.
ut.visual_histogram(dfHashtagFiltered,100,50)
ut.visual_by_date_together(dfHashtagFiltered,dfUse)
ut.visual_by_date_split(dfHashtagFiltered,dfUse)
# Second filtering pass: regular expressions for hashtags naming politicians,
# parties, institutions and public figures. Each one is matched against the
# lower-cased hashtag with str.match, i.e. anchored at the start only.
hastagRemove = ['.*lombardia.*','draghi','conte','m5s','mattarella','salvini','speranza','renzi','lega','.*governo.*',
                '.*moratti.*','zingaretti','scanzi','burioni','crisanti']

# Mark every hashtag that matches at least one of the patterns.
lowerTags = dfHashtagFiltered.index.str.lower()
toRemove = np.zeros(len(lowerTags), dtype=bool)
for r in hastagRemove:
    toRemove |= np.asarray(lowerTags.str.match(r, na=False), dtype=bool)

# Build the result as an independent copy. Assigning dfHashtagFiltered
# directly and dropping rows in place would also remove the rows from
# dfHashtagFiltered, so re-running an earlier cell would show altered data.
dfMoreFiltered = dfHashtagFiltered.loc[~toRemove].copy()
dfMoreFiltered
| count | |
|---|---|
| hashtags | |
| lockdown | 4492 |
| Arcuri | 3900 |
| NessunaCorrelazione | 3458 |
| BillGates | 3403 |
| staseraitalia | 3091 |
| ... | ... |
| appoggiati | 1 |
| approfondimento | 1 |
| aprireebasta | 1 |
| CuiProdest | 1 |
| 01Marzo | 1 |
17964 rows × 1 columns
# Same three plots again, on the table left after the second filtering pass.
# NOTE(review): the meaning of the numeric arguments (100, 50) is defined in
# hashtag_util.visual_histogram and is not visible here -- confirm.
ut.visual_histogram(dfMoreFiltered,100,50)
ut.visual_by_date_together(dfMoreFiltered,dfUse)
ut.visual_by_date_split(dfMoreFiltered,dfUse)
# Patterns of the hashtags under study. Each one is used both as a regular
# expression (anchored at the start of the lower-cased hashtag) and as the row
# label of dfSuspect.
# NOTE(review): 'dittatura*.' and 'byoblu*.' look like they were meant to be
# 'dittatura.*' / 'byoblu.*'; left as-is because the labels are reused below.
listHashtagsStudy = ['5g','billgates','dittatura*.','disobbedisco','nessunacorrelazione','byoblu*.']
dfSuspect = pd.DataFrame(index=listHashtagsStudy)

# Sum the counts of every remaining hashtag that matches each pattern.
tagsLower = dfMoreFiltered.index.str.lower()
for pattern in listHashtagsStudy:
    hits = tagsLower.str.match(pattern, na=False)
    dfSuspect.loc[pattern, 'count'] = dfMoreFiltered.loc[hits, 'count'].sum()
dfSuspect = dfSuspect.sort_values('count')

# Denominator for the share plot and figure shown in the second title.
totalUses = dfHashtags['count'].sum()

# Horizontal chart: share of each suspect pattern, in percent of totalUses.
fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count'] * 100 / totalUses, orientation='h')
fig.update_layout(title="Use of suspect hashtag (as a proportion of the total)]")
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage percent")
fig.show()

# Horizontal chart: absolute number of uses of each suspect pattern.
fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count'], orientation='h')
fig.update_layout(title="Use of suspect hashtag (total = %d)]" % totalUses)
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage")
fig.show()

ut.visual_by_date_together(dfSuspect, dfUse)
# Row labels of dfSuspect (the suspect-hashtag patterns), passed to every
# hashtag_util helper below.
suspectLabels = dfSuspect.index

# process_df_uses_hashtags / hashtagAND / hashtagOR are defined in
# hashtag_util. Judging by the rendered result they presumably produce one
# boolean column per pattern plus the 'AND' and 'OR<k>' columns -- confirm.
dfUseSus = ut.process_df_uses_hashtags(df, suspectLabels)
# Collapse to one row per user: True when any of the user's rows is True.
dfUseSus = dfUseSus.groupby('user').any()
dfUseSus = ut.hashtagAND(suspectLabels, dfUseSus)
# Add the columns OR1 .. OR<number of patterns - 1>.
for k in range(1, len(listHashtagsStudy)):
    dfUseSus = ut.hashtagOR(suspectLabels, dfUseSus, 'OR%d' % k, k)
dfUseSus
| disobbedisco | 5g | byoblu*. | dittatura*. | billgates | nessunacorrelazione | AND | OR1 | OR2 | OR3 | OR4 | OR5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user | ||||||||||||
| 00000o0OOOO00 | False | False | False | False | False | False | False | False | False | False | False | False |
| 09Espanol | False | False | False | False | False | False | False | False | False | False | False | False |
| 0Zedda | False | False | False | False | False | False | False | False | False | False | False | False |
| 1000whitecranes | False | False | False | False | False | False | False | False | False | False | False | False |
| 10clarenc3 | False | False | False | True | False | True | False | True | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| zittononcisto | False | False | False | False | True | False | False | True | False | False | False | False |
| zizionice | False | False | False | False | False | False | False | False | False | False | False | False |
| zuccaro_sonia | False | False | False | False | False | False | False | False | False | False | False | False |
| zxuz53 | False | False | False | False | False | False | False | False | False | False | False | False |
| zziocane66 | False | True | False | False | True | True | False | True | True | True | False | False |
8026 rows × 12 columns
# Print, for each 'OR<i>' column of dfUseSus, the percentage of accounts whose
# value in that column is True.
print("Number of account noVax (in dataframe noVax) that uses at least i hashtags")
totalAccounts = len(dfUseSus)
# The OR columns are created for i = 1 .. len(listHashtagsStudy) - 1, so the
# range is derived from the same list instead of a hard-coded upper bound;
# this keeps the report in step when the list of studied hashtags changes.
for i in range(1, len(listHashtagsStudy)):
    or_i = "OR%d" % i
    # Keep only the accounts flagged in this column.
    dfUseHashtagNovax = dfUseSus[dfUseSus[or_i]]
    print("\ti =", i, ":\t", (len(dfUseHashtagNovax) / totalAccounts) * 100, "%")
Number of account noVax (in dataframe noVax) that uses at least i hashtags i = 1 : 41.789185148268125 % i = 2 : 15.312733615748817 % i = 3 : 5.806130077248941 % i = 4 : 1.8813854971343136 % i = 5 : 0.3862447047096935 %
%%javascript
// Ask the kernel to execute a Python assignment that stores this notebook's
// file name (IPython.notebook.notebook_name) in the Python variable nb_name.
// NOTE(review): this relies on the front end exposing the IPython.notebook
// object; the NameError recorded further down shows nb_name is not always
// defined by the time the Python code reads it -- confirm the front end used.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
import os
import subprocess

# Export this notebook to HTML with `jupyter nbconvert`.
# nb_name is expected to have been set in the kernel by the %%javascript cell;
# it may be missing, so look it up defensively instead of failing with a
# NameError.
try:
    notebookFile = nb_name
except NameError:
    notebookFile = None

if notebookFile is None:
    print("Notebook convertion had an error: nb_name is not defined")
else:
    try:
        # Pass the arguments as a list (no shell): a notebook name containing
        # spaces or shell metacharacters is handed over as one argument.
        exitCode = subprocess.run(
            ["jupyter", "nbconvert", notebookFile, "--to", "html"]
        ).returncode
    except OSError:
        # The jupyter executable could not be started.
        exitCode = -1
    if exitCode == 0:
        print("Notebook converted correctly")
    else:
        print("Notebook convertion had an error")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-24-2677802532fc> in <module> 1 import os ----> 2 if os.system("jupyter nbconvert %s --to html"%nb_name)==0: 3 print("Notebook converted correctly") 4 else: 5 print("Notebook convertion had an error") NameError: name 'nb_name' is not defined